# Software Defect Prediction
import pandas as pd
from sklearn.model_selection import train_test_split
from base import datacleaning
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


def bar3d(corr):
    """Draw a square matrix as a 3-D bar chart and display it.

    One bar is drawn per cell of ``corr``. Bar heights are the cell value
    plus 1, so a matrix of correlation coefficients in [-1, 1] is rendered
    with heights in [0, 2] and negative values still rise from the z=0 plane.

    Args:
        corr: A square ``pandas.DataFrame`` (for example the result of
            ``DataFrame.corr()``); its ``columns`` label both horizontal
            axes and its ``values`` supply the bar heights.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    labels = np.array(corr.columns)
    index_list = np.arange(len(labels))
    # Shift every value by +1 so the bars all start at z=0 and grow upwards.
    heights = np.array(corr.values) + 1

    ax = plt.figure().add_subplot(111, projection='3d')

    # Anchor coordinates of each bar on the x/y grid.
    xpos, ypos = np.meshgrid(index_list, index_list)
    xpos = xpos.flatten('F')
    ypos = ypos.flatten('F')
    zpos = np.zeros_like(xpos)

    # Bar footprint (dx, dy) and height (dz).
    dx = 0.7 * np.ones_like(zpos)
    dy = dx.copy()
    dz = heights.flatten()

    # Axis titles.
    ax.set_xlabel('Features')
    ax.set_ylabel('Features')
    ax.set_zlabel('Relation')

    # Pin one tick per feature index before labelling it. The former
    # ``ax.w_xaxis`` / ``ax.w_yaxis`` aliases are no longer available in
    # current matplotlib, and labelling without fixed tick locations attaches
    # the names to arbitrary auto-generated ticks.
    ax.set_xticks(index_list)
    ax.set_xticklabels(labels)
    ax.set_yticks(index_list)
    ax.set_yticklabels(labels)

    ax.bar3d(xpos, ypos, zpos, dx, dy, dz, color='b')
    plt.show()


# Adds dataset normalization and feature-selection support.
def my_sdp_preprocessor():
    """Load, visualise, rescale, oversample and split the defect dataset.

    Steps, in order:
      1. Fetch the raw frame from ``datacleaning.getdata()`` and replace
         missing values with 0.
      2. Scale every column except the label ``'L'`` with
         ``MinMaxScaler(feature_range=(0, 2))``.
      3. Show a seaborn pair plot of three features plus the label, then
         print the correlation matrix of the raw frame and plot it with
         ``bar3d``.
      4. Oversample with ``SMOTE(random_state=42)``.
      5. Hold out 10% as the test set, then hold out another 10% of the
         remaining training rows (that second hold-out is not returned).

    NOTE(review): the scaler and SMOTE are both fitted on the full data set
    before it is split, so the test rows may be influenced by training
    rows - confirm this is intended.

    Returns:
        An 8-tuple ``(original_data, original_X, original_Y,
        combined_training_data, x_train, x_test, y_train, y_test)`` where
        ``original_X`` / ``original_Y`` are the scaled, oversampled features
        and labels, and ``combined_training_data`` is a copy of ``x_train``
        with the label column ``'L'`` added.
    """
    original_data = datacleaning.getdata()
    # Null-value probe; the boolean it produces is not used.
    original_data.isnull().values.any()
    # Replace missing entries with 0.
    original_data = original_data.fillna(value=0)
    print(original_data.describe())

    # Separate the features from the label and rescale the features.
    feature_frame = original_data.drop(columns=['L'])
    scaler = MinMaxScaler(feature_range=(0, 2))
    original_X = pd.DataFrame(
        data=scaler.fit_transform(feature_frame),
        columns=feature_frame.columns,
    )
    original_Y = pd.DataFrame(original_data['L'])

    # Visualise pairwise relationships between selected features and the label.
    pair_columns = ['BRANCH_COUNT', 'NODE_COUNT', 'EDGE_COUNT', 'L']
    sns.pairplot(
        pd.DataFrame(original_data, columns=pair_columns),
        kind='reg',
        diag_kind='kde',
    )
    plt.show()
    sns.set()

    # Correlation coefficients between all columns of the raw frame.
    corr = original_data.corr()
    print("corr: ")
    print(np.array(corr.values))
    print(np.array(corr.columns))

    bar3d(corr)

    # Original author's note: that way of telling variable kinds apart only
    # works when the data is not anonymised. This data is anonymised (e.g.
    # `name` should be a categorical variable), so it does not apply here and
    # the distinction has to be made by hand.

    # Oversample with imbalanced-learn's SMOTE; random_state seeds its RNG.
    oversampler = SMOTE(random_state=42)
    original_X, original_Y = oversampler.fit_resample(original_X, original_Y)

    # First split: 10% held out as the test set.
    x_train, x_test, y_train, y_test = train_test_split(
        original_X, original_Y, test_size=0.1, random_state=12)

    # Second split: a further 10% of the training rows is carved off; this
    # validation part is not returned to the caller.
    x_train, _x_val, y_train, _y_val = train_test_split(
        x_train, y_train, test_size=0.1, random_state=12)
    print(original_X.describe())

    # Training features with the label column re-attached.
    combined_training_data = x_train.copy()
    combined_training_data['L'] = y_train

    return (
        original_data,
        original_X,
        original_Y,
        combined_training_data,
        x_train,
        x_test,
        y_train,
        y_test,
    )


# Run the whole preprocessing pipeline (data load, plots, resampling, split)
# when this module is executed; the returned tuple is discarded.
# NOTE(review): this call is unguarded, so it also runs on `import` of this
# module - consider an `if __name__ == "__main__":` guard if that is unintended.
my_sdp_preprocessor()
